# import requirement libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
# for solve problem of show plotly plots
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
# for better plot visualization
plt.style.use('_mpl-gallery')
FONT = {'fontsize':20, 'fontstyle':'normal', 'fontfamily':'Times New Roman', 'backgroundcolor':'#145A32', 'color':'orange'} # for plot title
# import requirement sklearn functions
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, jaccard_score, log_loss
# load the Universal Bank personal-loan dataset
# NOTE(review): hard-coded absolute Windows path — only runs on the author's machine;
# consider a relative path or a config variable
df = pd.read_csv("/Users/davoo/OneDrive/Desktop/Applied Data Mining (ADS-502-01)/W3/Website Data Sets/bank_loan.csv", sep=",")
df
| ID | Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91108 | 4 | 1/60 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1/50 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1/00 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2/70 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1/00 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4995 | 4996 | 29 | 3 | 40 | 92697 | 1 | 1/90 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4996 | 4997 | 30 | 4 | 15 | 92037 | 4 | 0/40 | 1 | 85 | 0 | 0 | 0 | 1 | 0 |
| 4997 | 4998 | 63 | 39 | 24 | 93023 | 2 | 0/30 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4998 | 4999 | 65 | 40 | 49 | 90034 | 3 | 0/50 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4999 | 5000 | 28 | 4 | 83 | 92612 | 3 | 0/80 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
5000 rows × 14 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Age 5000 non-null int64 2 Experience 5000 non-null int64 3 Income 5000 non-null int64 4 ZIP Code 5000 non-null int64 5 Family 5000 non-null int64 6 CCAvg 5000 non-null object 7 Education 5000 non-null int64 8 Mortgage 5000 non-null int64 9 Personal Loan 5000 non-null int64 10 Securities Account 5000 non-null int64 11 CD Account 5000 non-null int64 12 Online 5000 non-null int64 13 CreditCard 5000 non-null int64 dtypes: int64(13), object(1) memory usage: 547.0+ KB
df.describe()
| ID | Age | Experience | Income | ZIP Code | Family | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.00000 | 5000.000000 | 5000.000000 |
| mean | 2500.500000 | 45.338400 | 20.104600 | 73.774200 | 93152.503200 | 2.396400 | 1.881000 | 56.498800 | 0.096000 | 0.104400 | 0.06040 | 0.596800 | 0.294000 |
| std | 1443.520003 | 11.463166 | 11.467954 | 46.033729 | 2121.852005 | 1.147663 | 0.839869 | 101.713802 | 0.294621 | 0.305809 | 0.23825 | 0.490589 | 0.455637 |
| min | 1.000000 | 23.000000 | -3.000000 | 8.000000 | 9307.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 |
| 25% | 1250.750000 | 35.000000 | 10.000000 | 39.000000 | 91911.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 |
| 50% | 2500.500000 | 45.000000 | 20.000000 | 64.000000 | 93437.000000 | 2.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 1.000000 | 0.000000 |
| 75% | 3750.250000 | 55.000000 | 30.000000 | 98.000000 | 94608.000000 | 3.000000 | 3.000000 | 101.000000 | 0.000000 | 0.000000 | 0.00000 | 1.000000 | 1.000000 |
| max | 5000.000000 | 67.000000 | 43.000000 | 224.000000 | 96651.000000 | 4.000000 | 3.000000 | 635.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 |
df.describe(include="O")
| CCAvg | |
|---|---|
| count | 5000 |
| unique | 108 |
| top | 0/30 |
| freq | 241 |
df.isnull().sum()
ID 0 Age 0 Experience 0 Income 0 ZIP Code 0 Family 0 CCAvg 0 Education 0 Mortgage 0 Personal Loan 0 Securities Account 0 CD Account 0 Online 0 CreditCard 0 dtype: int64
# CCAvg uses '/' as the decimal separator (e.g. '1/60' means 1.60);
# normalize to '.' and cast to float64
df['CCAvg'] = df['CCAvg'].str.replace('/', '.').astype('float64')
df
| ID | Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91108 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4995 | 4996 | 29 | 3 | 40 | 92697 | 1 | 1.9 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4996 | 4997 | 30 | 4 | 15 | 92037 | 4 | 0.4 | 1 | 85 | 0 | 0 | 0 | 1 | 0 |
| 4997 | 4998 | 63 | 39 | 24 | 93023 | 2 | 0.3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4998 | 4999 | 65 | 40 | 49 | 90034 | 3 | 0.5 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4999 | 5000 | 28 | 4 | 83 | 92612 | 3 | 0.8 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
5000 rows × 14 columns
df.dtypes
ID int64 Age int64 Experience int64 Income int64 ZIP Code int64 Family int64 CCAvg float64 Education int64 Mortgage int64 Personal Loan int64 Securities Account int64 CD Account int64 Online int64 CreditCard int64 dtype: object
round(df.describe().T, 2)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ID | 5000.0 | 2500.50 | 1443.52 | 1.0 | 1250.75 | 2500.5 | 3750.25 | 5000.0 |
| Age | 5000.0 | 45.34 | 11.46 | 23.0 | 35.00 | 45.0 | 55.00 | 67.0 |
| Experience | 5000.0 | 20.10 | 11.47 | -3.0 | 10.00 | 20.0 | 30.00 | 43.0 |
| Income | 5000.0 | 73.77 | 46.03 | 8.0 | 39.00 | 64.0 | 98.00 | 224.0 |
| ZIP Code | 5000.0 | 93152.50 | 2121.85 | 9307.0 | 91911.00 | 93437.0 | 94608.00 | 96651.0 |
| Family | 5000.0 | 2.40 | 1.15 | 1.0 | 1.00 | 2.0 | 3.00 | 4.0 |
| CCAvg | 5000.0 | 1.94 | 1.75 | 0.0 | 0.70 | 1.5 | 2.50 | 10.0 |
| Education | 5000.0 | 1.88 | 0.84 | 1.0 | 1.00 | 2.0 | 3.00 | 3.0 |
| Mortgage | 5000.0 | 56.50 | 101.71 | 0.0 | 0.00 | 0.0 | 101.00 | 635.0 |
| Personal Loan | 5000.0 | 0.10 | 0.29 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Securities Account | 5000.0 | 0.10 | 0.31 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| CD Account | 5000.0 | 0.06 | 0.24 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Online | 5000.0 | 0.60 | 0.49 | 0.0 | 0.00 | 1.0 | 1.00 | 1.0 |
| CreditCard | 5000.0 | 0.29 | 0.46 | 0.0 | 0.00 | 0.0 | 1.00 | 1.0 |
df[df['Experience'] < 0]
| ID | Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 89 | 90 | 25 | -1 | 113 | 94303 | 4 | 2.30 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 226 | 227 | 24 | -1 | 39 | 94085 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 315 | 316 | 24 | -2 | 51 | 90630 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 451 | 452 | 28 | -2 | 48 | 94132 | 2 | 1.75 | 3 | 89 | 0 | 0 | 0 | 1 | 0 |
| 524 | 525 | 24 | -1 | 75 | 93014 | 4 | 0.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 536 | 537 | 25 | -1 | 43 | 92173 | 3 | 2.40 | 2 | 176 | 0 | 0 | 0 | 1 | 0 |
| 540 | 541 | 25 | -1 | 109 | 94010 | 4 | 2.30 | 3 | 314 | 0 | 0 | 0 | 1 | 0 |
| 576 | 577 | 25 | -1 | 48 | 92870 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 583 | 584 | 24 | -1 | 38 | 95045 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 597 | 598 | 24 | -2 | 125 | 92835 | 2 | 7.20 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 649 | 650 | 25 | -1 | 82 | 92677 | 4 | 2.10 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 670 | 671 | 23 | -1 | 61 | 92374 | 4 | 2.60 | 1 | 239 | 0 | 0 | 0 | 1 | 0 |
| 686 | 687 | 24 | -1 | 38 | 92612 | 4 | 0.60 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 793 | 794 | 24 | -2 | 150 | 94720 | 2 | 2.00 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 889 | 890 | 24 | -2 | 82 | 91103 | 2 | 1.60 | 3 | 0 | 0 | 0 | 0 | 1 | 1 |
| 909 | 910 | 23 | -1 | 149 | 91709 | 1 | 6.33 | 1 | 305 | 0 | 0 | 0 | 0 | 1 |
| 1173 | 1174 | 24 | -1 | 35 | 94305 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1428 | 1429 | 25 | -1 | 21 | 94583 | 4 | 0.40 | 1 | 90 | 0 | 0 | 0 | 1 | 0 |
| 1522 | 1523 | 25 | -1 | 101 | 94720 | 4 | 2.30 | 3 | 256 | 0 | 0 | 0 | 0 | 1 |
| 1905 | 1906 | 25 | -1 | 112 | 92507 | 2 | 2.00 | 1 | 241 | 0 | 0 | 0 | 1 | 0 |
| 2102 | 2103 | 25 | -1 | 81 | 92647 | 2 | 1.60 | 3 | 0 | 0 | 0 | 0 | 1 | 1 |
| 2430 | 2431 | 23 | -1 | 73 | 92120 | 4 | 2.60 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2466 | 2467 | 24 | -2 | 80 | 94105 | 2 | 1.60 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2545 | 2546 | 25 | -1 | 39 | 94720 | 3 | 2.40 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2618 | 2619 | 23 | -3 | 55 | 92704 | 3 | 2.40 | 2 | 145 | 0 | 0 | 0 | 1 | 0 |
| 2717 | 2718 | 23 | -2 | 45 | 95422 | 4 | 0.60 | 2 | 0 | 0 | 0 | 0 | 1 | 1 |
| 2848 | 2849 | 24 | -1 | 78 | 94720 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2876 | 2877 | 24 | -2 | 80 | 91107 | 2 | 1.60 | 3 | 238 | 0 | 0 | 0 | 0 | 0 |
| 2962 | 2963 | 23 | -2 | 81 | 91711 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2980 | 2981 | 25 | -1 | 53 | 94305 | 3 | 2.40 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3076 | 3077 | 29 | -1 | 62 | 92672 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3130 | 3131 | 23 | -2 | 82 | 92152 | 2 | 1.80 | 2 | 0 | 0 | 1 | 0 | 0 | 1 |
| 3157 | 3158 | 23 | -1 | 13 | 94720 | 4 | 1.00 | 1 | 84 | 0 | 0 | 0 | 1 | 0 |
| 3279 | 3280 | 26 | -1 | 44 | 94901 | 1 | 2.00 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3284 | 3285 | 25 | -1 | 101 | 95819 | 4 | 2.10 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3292 | 3293 | 25 | -1 | 13 | 95616 | 4 | 0.40 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3394 | 3395 | 25 | -1 | 113 | 90089 | 4 | 2.10 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3425 | 3426 | 23 | -1 | 12 | 91605 | 4 | 1.00 | 1 | 90 | 0 | 0 | 0 | 1 | 0 |
| 3626 | 3627 | 24 | -3 | 28 | 90089 | 4 | 1.00 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3796 | 3797 | 24 | -2 | 50 | 94920 | 3 | 2.40 | 2 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3824 | 3825 | 23 | -1 | 12 | 95064 | 4 | 1.00 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 3887 | 3888 | 24 | -2 | 118 | 92634 | 2 | 7.20 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 3946 | 3947 | 25 | -1 | 40 | 93117 | 3 | 2.40 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4015 | 4016 | 25 | -1 | 139 | 93106 | 2 | 2.00 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4088 | 4089 | 29 | -1 | 71 | 94801 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4116 | 4117 | 24 | -2 | 135 | 90065 | 2 | 7.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4285 | 4286 | 23 | -3 | 149 | 93555 | 2 | 7.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4411 | 4412 | 23 | -2 | 75 | 90291 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 1 | 1 |
| 4481 | 4482 | 25 | -2 | 35 | 95045 | 4 | 1.00 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4514 | 4515 | 24 | -3 | 41 | 91768 | 4 | 1.00 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4582 | 4583 | 25 | -1 | 69 | 92691 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4957 | 4958 | 29 | -1 | 50 | 95842 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
# Experience cannot be negative — convert the 52 rows found above to positive.
# Defect fixed: the original assigned abs() over the *entire* selected rows
# (df[mask] = df[mask].abs()), which only produced the right result because
# every other column in those rows happened to be non-negative; take the
# absolute value of the Experience column alone instead.
df['Experience'] = df['Experience'].abs()
df
| ID | Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91108 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4995 | 4996 | 29 | 3 | 40 | 92697 | 1 | 1.9 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4996 | 4997 | 30 | 4 | 15 | 92037 | 4 | 0.4 | 1 | 85 | 0 | 0 | 0 | 1 | 0 |
| 4997 | 4998 | 63 | 39 | 24 | 93023 | 2 | 0.3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4998 | 4999 | 65 | 40 | 49 | 90034 | 3 | 0.5 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4999 | 5000 | 28 | 4 | 83 | 92612 | 3 | 0.8 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
5000 rows × 14 columns
# check missing values (as a one-row frame for compact display)
df.isna().sum().to_frame().T
| ID | Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# check invalid values: count distinct values per column to spot
# which columns are effectively categorical
for col in df:
    print(f"{col} has {df[col].nunique()} unique value")
ID has 5000 unique value Age has 45 unique value Experience has 44 unique value Income has 162 unique value ZIP Code has 468 unique value Family has 4 unique value CCAvg has 108 unique value Education has 3 unique value Mortgage has 347 unique value Personal Loan has 2 unique value Securities Account has 2 unique value CD Account has 2 unique value Online has 2 unique value CreditCard has 2 unique value
# Check value counts of columns that appear categorical according to above results
discrete_cols1 = ['Family', 'Education', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard']
for col in discrete_cols1:
    print(f"{col}:\n{df[col].value_counts()}")
    print('-' * 50)
Family: Family 1 1472 2 1296 4 1222 3 1010 Name: count, dtype: int64 -------------------------------------------------- Education: Education 1 2096 3 1501 2 1403 Name: count, dtype: int64 -------------------------------------------------- Personal Loan: Personal Loan 0 4520 1 480 Name: count, dtype: int64 -------------------------------------------------- Securities Account: Securities Account 0 4478 1 522 Name: count, dtype: int64 -------------------------------------------------- CD Account: CD Account 0 4698 1 302 Name: count, dtype: int64 -------------------------------------------------- Online: Online 1 2984 0 2016 Name: count, dtype: int64 -------------------------------------------------- CreditCard: CreditCard 0 3530 1 1470 Name: count, dtype: int64 --------------------------------------------------
# now check duplicated data (0 means no fully-duplicated rows)
df.duplicated().sum()
0
# drop ID and ZIP Code columns (identifiers, not useful as model features)
df.drop(['ID', 'ZIP Code'], axis=1, inplace=True)
df
| Age | Experience | Income | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 49 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 45 | 19 | 34 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 39 | 15 | 11 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 35 | 9 | 100 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 35 | 8 | 45 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4995 | 29 | 3 | 40 | 1 | 1.9 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4996 | 30 | 4 | 15 | 4 | 0.4 | 1 | 85 | 0 | 0 | 0 | 1 | 0 |
| 4997 | 63 | 39 | 24 | 2 | 0.3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4998 | 65 | 40 | 49 | 3 | 0.5 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4999 | 28 | 4 | 83 | 3 | 0.8 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
5000 rows × 12 columns
# check noisy data: pairplot of the continuous columns only
# (the categorical columns in discrete_cols1 are excluded)
%matplotlib inline
sns.set_palette('summer')
dnp = sns.pairplot(df.loc[:, ~df.columns.isin(discrete_cols1)])
dnp.fig.suptitle('Detect Noisy Data', y=1.02, **FONT)
plt.show()
C:\Users\davoo\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
#check the outliers
# (seaborn and matplotlib are already imported at the top of the file, so the
#  redundant re-imports that were here have been removed)
# Select the columns
columns_to_plot = ['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage']
# Create a boxplot
plt.figure(figsize=(12, 8)) # Set the figure size
sns.boxplot(data=df[columns_to_plot], orient='h') # Plot horizontal boxplots
plt.title('Boxplot of Selected Columns') # Set the title of the plot
plt.xlabel('Values') # Set the label for the x-axis
plt.ylabel('Variables') # Set the label for the y-axis
plt.show() # Show the plot
# convert annual income (in $thousands) to monthly by dividing by 12
df['Income'] = round(df['Income']/12, 2)
df
| Age | Experience | Income | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 4.08 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 45 | 19 | 2.83 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 39 | 15 | 0.92 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 35 | 9 | 8.33 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 35 | 8 | 3.75 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4995 | 29 | 3 | 3.33 | 1 | 1.9 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4996 | 30 | 4 | 1.25 | 4 | 0.4 | 1 | 85 | 0 | 0 | 0 | 1 | 0 |
| 4997 | 63 | 39 | 2.00 | 2 | 0.3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4998 | 65 | 40 | 4.08 | 3 | 0.5 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4999 | 28 | 4 | 6.92 | 3 | 0.8 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
5000 rows × 12 columns
# check distribution of every column: the 12 remaining columns
# fill a 4x3 grid of histograms (with KDE overlay)
sns.set_palette('summer')
fig, ax = plt.subplots(4,3,figsize=(12,20))
for i, col in enumerate(df):
    sns.histplot(df[col], kde=True, ax=ax[i//3, i%3])
fig.suptitle('Distribution of Columns', y=1.02, **FONT)
plt.show()
# univariate analysis of categorical data:
# one figure per categorical column, with a countplot (left) and pie chart (right)
sns.set_palette("summer_r")
for i, col in enumerate(discrete_cols1):
    fig, axes = plt.subplots(1,2,figsize=(10,4))
    # count of col (countplot)
    sns.countplot(data=df, x=col, ax=axes[0])
    for container in axes[0].containers:
        axes[0].bar_label(container)
    # count of col (pie chart) — slices/labels sorted by category value
    slices = df[col].value_counts().sort_index().values
    activities = [var for var in df[col].value_counts().sort_index().index]
    axes[1].pie(slices, labels=activities, shadow=True, autopct='%1.1f%%')
    plt.suptitle(f'Count of Unique Value in {col} (Fig {i+1})', y=1.09, **FONT)
    plt.show()
# univariate analysis of numerical data: describe the non-categorical columns
df.loc[:, ~df.columns.isin(discrete_cols1)].describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Age | 5000.0 | 45.338400 | 11.463166 | 23.00 | 35.00 | 45.00 | 55.00 | 67.00 |
| Experience | 5000.0 | 20.134600 | 11.415189 | 0.00 | 10.00 | 20.00 | 30.00 | 43.00 |
| Income | 5000.0 | 6.147912 | 3.836233 | 0.67 | 3.25 | 5.33 | 8.17 | 18.67 |
| CCAvg | 5000.0 | 1.937938 | 1.747659 | 0.00 | 0.70 | 1.50 | 2.50 | 10.00 |
| Mortgage | 5000.0 | 56.498800 | 101.713802 | 0.00 | 0.00 | 0.00 | 101.00 | 635.00 |
According to the above plots and table:
Customers with the number of Family 1 and the number of Family 3 respectively have the highest frequency and the lowest frequency, but in general, the customers with the number of Family 1, 2, 3, and 4 are almost equally distributed in the dataset (Fig 1). The customers who did not accept a Personal Loan are much more than the customers who accepted a Personal Loan, and therefore there is an imbalance in the classes, so we must be careful to consider the imbalance in the model section for resampling the data. Most of the bank's customers have education level 1 i.e. Undergrad (Fig 2). Most of the bank's customers (90.4%) did not accept the personal loan offer (Fig 3). Most of the bank's customers (89.6%) did not have a Securities Account (Fig 4). Most of the bank's customers (94%) did not have a CD Account (Fig 5). Most of the bank's customers (59.7%) used internet banking facilities (Fig 6). Most of the bank's customers (70.6%) did not use a credit card issued by Universal Bank(Fig 6). The Age range of customers is between 23 and 67 years. The average age of customers is almost 45. The Experience range of customers is between 0 and 43 years. The mean age of customers is almost 20. The age and experience columns have a similar distribution. Also, the column of income, mortgage and average distribution are almost similar, all of them are skewed to the right. The average income of the bank's customers per month is approximately 6 thousand dollars and its range is between 0.67 and 18.67 thousand dollars. CCAvg of the bank's customers per month is approximately 1.94 thousand dollars and its range is between 0 and 10 thousand dollars. The average Mortgage of the bank's customers is approximately 56 thousand dollars and its range is between 0 and 635 thousand dollars.
# count of Personal Loan acceptance broken down by each categorical feature
# (the original comment mentioned "Gender", which does not exist in this dataset)
%matplotlib inline
sns.set_palette(['#1f4a1b','orange','#bbff33','yellow'])
discrete_cols2 = ['Family', 'Education', 'Securities Account', 'CD Account', 'Online', 'CreditCard']
for i, col in enumerate(discrete_cols2):
    ax = sns.countplot(data=df, x='Personal Loan', hue=col)
    for container in ax.containers:
        ax.bar_label(container)
    plt.title(f'Count of Personal Loan based on {col} (Fig {i+5})', fontdict=FONT, pad=15)
    plt.show()
According to above plots:
Among the people who did not accept the personal loan, most of them had a family equal to 1, but among the people who accepted the personal loan, there is not much difference in terms of family (Fig 5). Among the people who did not accept the personal loan, most of them had an Education of 1, but among the people who accepted the personal loan, the Education was mostly 3 or 2 (Fig 6). Most of the people, both those who accepted the personal loan and those who did not, did not have a Securities Account (Fig 7). Most of the people, both those who accepted the personal loan and those who did not, did not have a CD Account (Fig 8). Most of the people, both those who accepted the personal loan and those who did not, used online banking facilities (Fig 9). Most of the people, both those who accepted the personal loan and those who did not, did not use a Creditcard (Fig 10).
# Mean of Income, CCAvg and Mortgage based on each categorical feature:
# for each (numeric, categorical) pair draw a bar plot of group means (left)
# and a boxplot of the distribution per category (right)
for i, col in enumerate(['Income', 'CCAvg','Mortgage']):
    print('='*30, f"Mean of {col} in each categorical feature", '='*30)
    for j, cat in enumerate(discrete_cols2):
        fig , ax= plt.subplots(1,2, figsize=(10,4))
        gp = df.groupby([cat])[col].mean().to_frame().reset_index()
        sns.barplot(data=gp, x=cat, y=col, ax=ax[0])
        for container in ax[0].containers:
            ax[0].bar_label(container)
        ax[0].set_title(f'Mean of {col} (based on {cat})', y=1.09, **FONT)
        sns.boxplot(data=df, x=cat, y=col, ax=ax[1])
        ax[1].set_title(f'Boxplot of {cat} (Fig {i+11}-{j+1})', y=1.09, **FONT)
        plt.show()
============================== Mean of Income in each categorical feature ==============================
============================== Mean of CCAvg in each categorical feature ==============================
============================== Mean of Mortgage in each categorical feature ==============================
According to above plots:
Customers whose Family was 2 had the highest average Income (7.02 thousand, Fig 11-2). The average income of customers whose Securities Account, CreditCard or Online was 1 is about the same as that of those for whom it was zero (6.1 thousand, Fig 11-4). Similar results can be obtained for the CCAvg and Mortgage averages, which shows that the behavior of the CCAvg, Mortgage and Income columns is somewhat similar (Fig 12-1 to 13-6). According to the heatmap, Personal Loan has the highest correlation with Income, CCAvg and CD Account respectively. Age and Experience have an almost perfectly linear relationship with each other.
# draw heatmap of pairwise correlations between all (now numeric) columns
plt.figure(figsize=(10,8))
sns.heatmap(round(df.corr(),2), cmap='Greens', annot=True)
plt.title('Heatmap of Correlations', y=1.02, fontdict=FONT)
plt.show()
# draw pairplot of the continuous features, colored by Personal Loan
sns.set_palette(['#1f4a1b','orange','#bbff33','yellow'])
splot = sns.pairplot(data=df, x_vars=['Age','Experience','Income','CCAvg','Mortgage'], y_vars=['Age','Experience','Income','CCAvg','Mortgage'], hue='Personal Loan')
splot.fig.suptitle('Scatter plot of continuous feature (hue = Personal Loan)', y=1.05, **FONT)
plt.show()
C:\Users\davoo\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
# pivot table: mean Personal Loan acceptance rate per (CD Account, Education) cell
results = pd.pivot_table(data=df, index='CD Account', columns='Education', values='Personal Loan')
sns.heatmap(results, cmap='Greens', annot=True)
plt.show()
According to above plots:
Customers whose Income is less than 10 thousand per month and their CCAvg is less than $3 thousand per month have not accepted a personal loan. 62% of customers whose CD Account was 1 and Education was 2 have accepted a personal loan.
# 6x2 grid of stripplots: Income (left column) and CCAvg (right column)
# against each categorical feature, colored by Personal Loan
fig, ax = plt.subplots(6,2,figsize=(14,24))
sns.stripplot(data=df, x='Education', y='Income', hue='Personal Loan', ax=ax[0,0])
sns.stripplot(data=df, x='Education', y='CCAvg', hue='Personal Loan', ax=ax[0,1])
sns.stripplot(data=df, x='Family', y='Income', hue='Personal Loan', ax=ax[1,0])
sns.stripplot(data=df, x='Family', y='CCAvg', hue='Personal Loan', ax=ax[1,1])
sns.stripplot(data=df, x='CD Account', y='Income', hue='Personal Loan', ax=ax[2,0])
sns.stripplot(data=df, x='CD Account', y='CCAvg', hue='Personal Loan', ax=ax[2,1])
sns.stripplot(data=df, x='Online', y='Income', hue='Personal Loan', ax=ax[3,0])
sns.stripplot(data=df, x='Online', y='CCAvg', hue='Personal Loan', ax=ax[3,1])
sns.stripplot(data=df, x='CreditCard', y='Income', hue='Personal Loan', ax=ax[4,0])
sns.stripplot(data=df, x='CreditCard', y='CCAvg', hue='Personal Loan', ax=ax[4,1])
sns.stripplot(data=df, x='Securities Account', y='Income', hue='Personal Loan', ax=ax[5,0])
sns.stripplot(data=df, x='Securities Account', y='CCAvg', hue='Personal Loan', ax=ax[5,1])
ax[0,0].set_title('Stripplot of Personal Loan vs Income',y=1.05, **FONT)
ax[0,1].set_title('Stripplot of Personal Loan vs CCAvg',y=1.05, **FONT)
plt.tight_layout()
plt.show()
According to above plots:
All customers with Income of more than 5 thousand and with Education level 2 or 3 accepted Personal Loans. All customers with Income of more than 5 thousand and with Family 3 or 4 accepted Personal Loans. Most customers with Income of more than 5 thousand and with CD Account 1 accepted Personal Loans.
# define x (feature matrix) and y (target)
x = df.drop('Personal Loan', axis=1)
# reshaped to a column vector; downstream code calls .ravel() before fitting
y = df['Personal Loan'].values.reshape(-1,1)
# accumulators shared across all modeling cells; one entry per fitted model
Model = []        # model names (used as legend/hover labels)
FPR = []          # false-positive-rate arrays
TPR = []          # true-positive-rate arrays
ACC_test = []     # test-set accuracies
ACC_train = []    # train-set accuracies
Recall = []       # test-set recall scores
Precision = []    # test-set precision scores
F1 = []           # test-set F1 scores
AUC = []          # AUC scores
def delete_results():
    """Clear the stored results of previous models to avoid congestion in the ROC charts.

    The metric lists are emptied in place; Model (the list of names) is
    intentionally left untouched, matching the original behavior.
    """
    for store in (FPR, TPR, ACC_test, ACC_train, Recall, Precision, F1, AUC):
        store.clear()
def plot_confusion_matrix2(cm, classes,
                           title='Confusion matrix',
                           cmap=plt.cm.Blues):
    """
    Plot a confusion matrix as an annotated heat image.

    cm(array): confusion matrix, assumed 2x2 (the `names` labels below cover
               exactly the four cells of a binary classifier)
    classes(dictionary): classes of our target (key=categorical type, value=numerical type)
    title(string): plot title
    cmap: matplotlib colormap for the image
    """
    plt.figure(figsize=(10,7))
    plt.grid(False)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # one tick per class, labelled "value=key" from the classes mapping
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, [f"{value}={key}" for key , value in classes.items()], rotation=45)
    plt.yticks(tick_marks, [f"{value}={key}" for key , value in classes.items()])
    # cell labels in row-major order: TN, FP, FN, TP
    names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    # flip text color on dark cells for readability
    thresh = cm.max() / 2.
    for k, (i,j) in enumerate(itertools.product(range(cm.shape[0]), range(cm.shape[1]))):
        plt.text(j, i, f"{names[k]}\n{cm[i,j]}\n{cm[i,j]/np.sum(cm)*100:.2f}%",
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.tight_layout()
    plt.show()
def Perform_cross_val(model, k, x, y, scoring):
    """
    Run stratified k-fold cross validation and print the per-fold and mean scores.

    model: estimator to evaluate
    k(scaler): the value for n_splits in StratifiedKFold()
    x(DataFrame or array): x_train
    y(DataFrame or array): y_train
    scoring(string): an approach for evaluation in cross validation
    """
    splitter = StratifiedKFold(n_splits=k)
    scores = cross_val_score(model, x, y.ravel(), cv=splitter, scoring=scoring)
    print('-'*20, f"CV for k={k}, scoring={scoring}", '-'*20)
    print(f"CV mean: {np.mean(scores)}")
    print(f"CV results: {scores}\n")
def find_fold_index(k, x):
    """
    Print the [first, last] row-index range of each test fold in a KFold split.

    k(scaler): the value used for n_splits in KFold()
    x(DataFrame or array): x_train
    """
    # Defect fixed: the original rebuilt a throwaway Python list of every test
    # index on each fold (and initialized it twice) just to read the first and
    # last element; index the fold's ndarray directly instead. Printed output
    # is unchanged.
    for j, (_, test) in enumerate(KFold(k).split(x), start=1):
        print(f"fold {j}: [{test[0]},{test[-1]}]")
        print(20*'-')
def change_test_size(model, x, y, name):
    """
    Try to improve the model by varying the train/test split size.

    Fits `model` for each candidate test_size (stratified split, fixed seed)
    and collects train/test accuracy and recall.

    model: estimator to fit (refit in place for each split)
    x(DataFrame or array): features
    y(array): target (column vector; ravel()ed before fitting)
    name(string): label prefix for each row of the result table
    Returns a pandas Styler of the results sorted by test recall (descending).
    """
    # try to imporve model by changing test_size
    test_sizes= [0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
    acc_table = pd.DataFrame(columns=['Model', 'test_size', 'ACC_train', 'ACC_test', 'Recall_train', 'Recall_test'])
    for i, test_size in enumerate(test_sizes):
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=0, stratify=y)
        model.fit(x_train, y_train.ravel())
        y_pred_train = model.predict(x_train)
        y_pred_test = model.predict(x_test)
        acc_test_i = accuracy_score(y_test, y_pred_test)
        acc_train_i = accuracy_score(y_train, y_pred_train)
        rec_test_i = recall_score(y_test, y_pred_test)
        rec_train_i = recall_score(y_train, y_pred_train)
        # append one result row per split size
        acc_table.loc[len(acc_table.index)] = [f"{name} {i+1}", str(test_size), acc_train_i, acc_test_i, rec_train_i, rec_test_i]
    return acc_table.sort_values(by=['Recall_test'], ascending=False).style.background_gradient(cmap='summer_r')
def plot_results(FPR, TPR, AUC, ACC_test, ACC_train, Recall, Precision, F1, y_proba_test, y_test, model_name, Model):
    """
    Draw the ROC curves, a probability histogram and evaluation-metric plots.

    FPR(list): list of false-positive-rate arrays, one per model
    TPR(list): list of true-positive-rate arrays, one per model
    AUC(list): list of AUC scores, one per model
    ACC_test(list) / ACC_train(list): test/train accuracies, one per model
    Recall(list), Precision(list), F1(list): test-set scores, one per model
    y_proba_test(array): predicted class probabilities for the latest model's test set
    y_test(array): true labels of the test set
    model_name(string): name of the latest model (unused; kept for interface stability)
    Model(list): names of all models, used as legend/hover labels
    """
    fig1 = go.Figure()
    fig2 = go.Figure()
    # the green dashed line represents the no-skill diagonal where TPR = FPR
    fig1.add_shape(type='line', line=dict(color='green', dash='dash'),x0=0, x1=1, y0=0, y1=1)
    for fpr_i, tpr_i, auc_i, name in zip(FPR, TPR, AUC, Model):
        # ROC Curve: one trace per model
        fig1.add_trace(go.Scatter(x=fpr_i, y=tpr_i, name=f"{name} AUC = {auc_i:.4f}", mode='lines'))
    # the histogram of scores compared to true labels.
    # Defect fixed: this block was inside the loop above, so the same two
    # histogram traces were re-added once per model; add them exactly once.
    fig_hist = px.histogram(x=y_proba_test[:,1], color=y_test.ravel(), nbins=50, labels=dict(color='Personal Loan', x='Probability'))
    fig2.add_trace(fig_hist.data[0])
    fig2.add_trace(fig_hist.data[1])
    # Reduce opacity to see both histograms
    fig2.update_traces(opacity=0.75)
    # Accuracy plot: metrics per model (left) and train-vs-test accuracy (right)
    fig3 = make_subplots(rows=1, cols=2)
    fig3.add_trace(go.Scatter(y=ACC_test, mode='lines+markers', name='ACC test', hovertemplate="<b>%{text}</b><br>" +"(%{x},%{y})", text=Model), row=1, col=1)
    fig3.add_trace(go.Scatter(y=Recall, mode='lines+markers', name='Recall', hovertemplate="<b>%{text}</b><br>" +"(%{x},%{y})", text=Model), row=1, col=1)
    fig3.add_trace(go.Scatter(y=Precision, mode='lines+markers', name='Precision', hovertemplate="<b>%{text}</b><br>" +"(%{x},%{y})", text=Model), row=1, col=1)
    fig3.add_trace(go.Scatter(y=F1, mode='lines+markers', name='F1 score', hovertemplate="<b>%{text}</b><br>" +"(%{x},%{y})", text=Model), row=1, col=1)
    fig3.add_trace(go.Scatter(y=ACC_train, mode='lines+markers', name='ACC train', hovertemplate="<b>%{text}</b><br>" +"(%{x},%{y})", text=Model), row=1, col=2)
    fig3.add_trace(go.Scatter(y=ACC_test, mode='lines+markers', name='ACC test', hovertemplate="<b>%{text}</b><br>" +"(%{x},%{y})", text=Model), row=1, col=2)
    # update layout and show figs
    fig1.update_layout(
        title= 'ROC curve and AUC score',
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
        width=700, height=500,
        showlegend=True)
    fig2.update_layout(
        # showlegend=True,
        barmode='overlay', # Overlay both histograms
        title='Interpret ROC curve by histogram',
        xaxis_title='Probability',
        yaxis_title='Count')
    fig3.update_layout(
        showlegend=True,
        # Defect fixed: the title had a stray trailing ')' in the displayed text
        title='Model Evaluation & Train and Test Accuracy',
        xaxis_title='Model',
        yaxis_title='Evaluation measure')
    # NOTE(review): ticktext without a matching tickvals has no visible effect
    # in plotly; kept as-is for compatibility — confirm the intended labels
    fig3.update_xaxes(ticktext=list(range(1,20)))
    fig1.show()
    fig2.show()
    fig3.show()
def modeling(clf, x, y, test_size, classes, model_name, stratify=False):
    """Split the data, fit ``clf``, print evaluation reports, and record
    metrics into the module-level tracking lists.

    Parameters
    ----------
    clf : estimator
        Any classifier exposing ``fit`` / ``predict`` / ``predict_proba``.
    x, y : array-like
        Features and binary (0/1) target.
    test_size : float
        Fraction of the data held out as the test set.
    classes : dict
        Class-name -> index mapping, forwarded to plot_confusion_matrix2.
    model_name : str
        Label stored with the results and used in the plots.
    stratify : bool, default False
        If True, preserve the target's class distribution when splitting.

    Returns
    -------
    tuple
        (acc_test, acc_train, rec_test, rec_train)

    Side effects: appends to the module-level lists Model, FPR, TPR,
    ACC_test, ACC_train, Recall, Precision, F1, AUC and draws plots via
    plot_results / plot_confusion_matrix2.
    """

    def _print_class_distribution(split_name, labels):
        # One line per class: count and percentage within this split.
        total = len(labels)
        print(15*'-', f'Class Distribution in {split_name}', 15*'-')
        for c in np.unique(labels):
            n_examples = len(labels[labels == c])
            percent = n_examples / total * 100
            print(f"> Class={c:d} : {n_examples:d}/{total:d} ({percent:.1f}%)")

    # split data to train and test (optionally stratified on y)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=0,
        stratify=y if stratify else None)
    print(20*'-', 'Shape', 20*'-')
    print(f"x_train: {x_train.shape}")
    print(f"y_train: {y_train.shape}")
    print(f"x_test: {x_test.shape}")
    print(f"y_test: {y_test.shape}")
    _print_class_distribution('y_test', y_test)
    _print_class_distribution('y_train', y_train)
    # fit model; ravel() flattens the (n, 1) target to the (n,) shape sklearn expects
    clf.fit(x_train, y_train.ravel())
    # predictions and class probabilities for both splits
    y_pred_train = clf.predict(x_train)
    y_pred_test = clf.predict(x_test)
    y_proba_train = clf.predict_proba(x_train)
    y_proba_test = clf.predict_proba(x_test)
    # ROC curve needs the positive-class probability (column 1)
    fpr, tpr, _ = roc_curve(y_test, y_proba_test[:,1])
    roc_auc = auc(fpr, tpr)
    cm = confusion_matrix(y_test, y_pred_test)
    acc_test = accuracy_score(y_test, y_pred_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    rec_test = recall_score(y_test, y_pred_test)
    rec_train = recall_score(y_train, y_pred_train)
    pre = precision_score(y_test, y_pred_test)
    f1 = f1_score(y_test, y_pred_test)
    # append results to the module-level trackers for cross-model comparison
    Model.append(model_name)
    FPR.append(fpr)
    TPR.append(tpr)
    ACC_test.append(acc_test)
    ACC_train.append(acc_train)
    Recall.append(rec_test)
    Precision.append(pre)
    F1.append(f1)
    AUC.append(roc_auc)
    plot_results(FPR, TPR, AUC, ACC_test, ACC_train, Recall, Precision, F1, y_proba_test, y_test, model_name, Model)
    # Evaluation reports
    print('-'*20 , 'Confusion Matrix', '-'*20)
    print(cm)
    plot_confusion_matrix2(cm, classes,
                           title='Confusion matrix',
                           cmap=plt.cm.Blues)
    # or use plot_confusion_matrix from sklearn.metrics
    print('-'*20 , 'Classification Report', '-'*20)
    print(classification_report(y_test, y_pred_test, ), '\n')
    print(f"Jaccard Score: {jaccard_score(y_test, y_pred_test)}")
    # BUG FIX: log_loss must be computed from predicted probabilities, not
    # hard 0/1 labels -- passing y_pred_test degenerated it to a fixed
    # penalty per misclassification instead of a probabilistic score.
    print(f"Log loss: {log_loss(y_test, y_proba_test)}", '\n')
    return acc_test, acc_train, rec_test, rec_train
# Split train/test with the initial test_size=0.2.
# stratify=y preserves the target's class distribution in both splits.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)
# Baseline pipeline: standardize features, then LogisticRegression (newton-cg solver).
# 10-fold cross-validation estimates expected accuracy before fitting a final model.
logreg = Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression(solver='newton-cg'))])
Perform_cross_val(logreg, k=10, x=x_train, y=y_train, scoring='accuracy')
-------------------- CV for k=10, scoring=accuracy -------------------- CV mean: 0.9512499999999999 CV results: [0.9475 0.9525 0.9525 0.945 0.955 0.9525 0.9475 0.95 0.96 0.95 ]
# Repeat the cross-validation, this time scoring recall -- the metric of
# interest given the imbalanced (~9.6% positive) target shown below.
Perform_cross_val(logreg, k=10, x=x_train, y=y_train, scoring='recall')
-------------------- CV for k=10, scoring=recall -------------------- CV mean: 0.6303643724696357 CV results: [0.60526316 0.63157895 0.68421053 0.63157895 0.68421053 0.60526316 0.58974359 0.58974359 0.66666667 0.61538462]
# Fit the initial LogisticRegression model (test_size=0.2, stratified split)
# and collect its metrics/plots via the modeling() helper.
acc_test_5_1, acc_train_5_1, rec_test_5_1, rec_train_5_1 = modeling(
    clf=logreg,
    x=x,
    y=y,
    test_size=0.2,
    classes={'Not Accepted':0, 'Accepted':1},  # label names for the confusion-matrix plot
    model_name='LogisticReg 1',
    stratify=True)
-------------------- Shape -------------------- x_train: (4000, 11) y_train: (4000, 1) x_test: (1000, 11) y_test: (1000, 1) --------------- Class Distribution in y_test --------------- > Class=0 : 904/1000 (90.4%) > Class=1 : 96/1000 (9.6%) --------------- Class Distribution in y_train --------------- > Class=0 : 3616/4000 (90.4%) > Class=1 : 384/4000 (9.6%)
-------------------- Confusion Matrix -------------------- [[892 12] [ 36 60]]
-------------------- Classification Report --------------------
precision recall f1-score support
0 0.96 0.99 0.97 904
1 0.83 0.62 0.71 96
accuracy 0.95 1000
macro avg 0.90 0.81 0.84 1000
weighted avg 0.95 0.95 0.95 1000
Jaccard Score: 0.5555555555555556
Log loss: 1.7300953626776234
# Compare train vs. test recall to check for over/under-fitting.
print(f"Recall train: {rec_train_5_1}")
print(f"Recall test: {rec_test_5_1}")
Recall train: 0.6328125 Recall test: 0.625
The accuracy and recall obtained are in line with what we expected from cross-validation. To try to improve the models, other test-size values are also checked:
# Try to improve the model by changing test_size
# (the change_test_size helper refits the pipeline for several split sizes).
pipe5_1 = Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression())])
change_test_size(pipe5_1, x, y, 'LogisticReg')
| Model | test_size | ACC_train | ACC_test | Recall_train | Recall_test | |
|---|---|---|---|---|---|---|
| 0 | LogisticReg 1 | 0.2 | 0.952500 | 0.952000 | 0.632812 | 0.625000 |
| 2 | LogisticReg 3 | 0.3 | 0.952000 | 0.949333 | 0.636905 | 0.618056 |
| 1 | LogisticReg 2 | 0.25 | 0.954133 | 0.947200 | 0.650000 | 0.608333 |
| 5 | LogisticReg 6 | 0.45 | 0.956000 | 0.948444 | 0.655303 | 0.597222 |
| 4 | LogisticReg 5 | 0.4 | 0.954000 | 0.948500 | 0.649306 | 0.593750 |
| 3 | LogisticReg 4 | 0.35 | 0.953846 | 0.945143 | 0.657051 | 0.583333 |
# Find the best hyper-parameters for the LogisticRegression estimator.
# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# NOTE: not every solver supports every penalty; invalid combinations fail
# to fit and are scored 0 via error_score=0 below (see the FitFailedWarning).
penalty = ['none','l2', 'l1']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search over solver x penalty x C, scored on recall
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='recall',error_score=0)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)
# Standardize using statistics fitted on the training split only (no test leakage).
scaler = StandardScaler().fit(x_train)
x_norm_train = scaler.transform(x_train)
x_norm_test = scaler.transform(x_test)
grid_result = grid_search.fit(x_norm_train, y_train.ravel())
# summarize results: best score/params, then every grid cell's mean (std)
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean} ({stdev}) with: {param}")
Best: 0.6251012145748989 using {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
0.6250337381916329 (0.04658905893093731) with: {'C': 100, 'penalty': 'none', 'solver': 'newton-cg'}
0.6250337381916329 (0.04658905893093731) with: {'C': 100, 'penalty': 'none', 'solver': 'lbfgs'}
0.0 (0.0) with: {'C': 100, 'penalty': 'none', 'solver': 'liblinear'}
0.6250337381916329 (0.04658905893093731) with: {'C': 100, 'penalty': 'none', 'solver': 'sag'}
0.6250337381916329 (0.04658905893093731) with: {'C': 100, 'penalty': 'none', 'solver': 'saga'}
0.6250337381916329 (0.04658905893093731) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.6250337381916329 (0.04658905893093731) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.6250337381916329 (0.04658905893093731) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.6250337381916329 (0.04658905893093731) with: {'C': 100, 'penalty': 'l2', 'solver': 'sag'}
0.6250337381916329 (0.04658905893093731) with: {'C': 100, 'penalty': 'l2', 'solver': 'saga'}
0.0 (0.0) with: {'C': 100, 'penalty': 'l1', 'solver': 'newton-cg'}
0.0 (0.0) with: {'C': 100, 'penalty': 'l1', 'solver': 'lbfgs'}
0.6250337381916329 (0.04658905893093731) with: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.0 (0.0) with: {'C': 100, 'penalty': 'l1', 'solver': 'sag'}
0.6250337381916329 (0.04658905893093731) with: {'C': 100, 'penalty': 'l1', 'solver': 'saga'}
0.6250337381916329 (0.04658905893093731) with: {'C': 10, 'penalty': 'none', 'solver': 'newton-cg'}
0.6250337381916329 (0.04658905893093731) with: {'C': 10, 'penalty': 'none', 'solver': 'lbfgs'}
0.0 (0.0) with: {'C': 10, 'penalty': 'none', 'solver': 'liblinear'}
0.6250337381916329 (0.04658905893093731) with: {'C': 10, 'penalty': 'none', 'solver': 'sag'}
0.6250337381916329 (0.04658905893093731) with: {'C': 10, 'penalty': 'none', 'solver': 'saga'}
0.6250337381916329 (0.04658905893093731) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.6250337381916329 (0.04658905893093731) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.6250337381916329 (0.04658905893093731) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.6250337381916329 (0.04658905893093731) with: {'C': 10, 'penalty': 'l2', 'solver': 'sag'}
0.6250337381916329 (0.04658905893093731) with: {'C': 10, 'penalty': 'l2', 'solver': 'saga'}
0.0 (0.0) with: {'C': 10, 'penalty': 'l1', 'solver': 'newton-cg'}
0.0 (0.0) with: {'C': 10, 'penalty': 'l1', 'solver': 'lbfgs'}
0.6250337381916329 (0.04658905893093731) with: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
0.0 (0.0) with: {'C': 10, 'penalty': 'l1', 'solver': 'sag'}
0.6250337381916329 (0.04658905893093731) with: {'C': 10, 'penalty': 'l1', 'solver': 'saga'}
0.6250337381916329 (0.04658905893093731) with: {'C': 1.0, 'penalty': 'none', 'solver': 'newton-cg'}
0.6250337381916329 (0.04658905893093731) with: {'C': 1.0, 'penalty': 'none', 'solver': 'lbfgs'}
0.0 (0.0) with: {'C': 1.0, 'penalty': 'none', 'solver': 'liblinear'}
0.6250337381916329 (0.04658905893093731) with: {'C': 1.0, 'penalty': 'none', 'solver': 'sag'}
0.6250337381916329 (0.04658905893093731) with: {'C': 1.0, 'penalty': 'none', 'solver': 'saga'}
0.6224696356275303 (0.04766742047588906) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.6224696356275303 (0.04766742047588906) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.6224696356275303 (0.04766742047588906) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.6224696356275303 (0.04766742047588906) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'sag'}
0.6224696356275303 (0.04766742047588906) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'saga'}
0.0 (0.0) with: {'C': 1.0, 'penalty': 'l1', 'solver': 'newton-cg'}
0.0 (0.0) with: {'C': 1.0, 'penalty': 'l1', 'solver': 'lbfgs'}
0.6251012145748989 (0.04588513136503678) with: {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
0.0 (0.0) with: {'C': 1.0, 'penalty': 'l1', 'solver': 'sag'}
0.6251012145748989 (0.04588513136503678) with: {'C': 1.0, 'penalty': 'l1', 'solver': 'saga'}
0.6250337381916329 (0.04658905893093731) with: {'C': 0.1, 'penalty': 'none', 'solver': 'newton-cg'}
0.6250337381916329 (0.04658905893093731) with: {'C': 0.1, 'penalty': 'none', 'solver': 'lbfgs'}
0.0 (0.0) with: {'C': 0.1, 'penalty': 'none', 'solver': 'liblinear'}
0.6250337381916329 (0.04658905893093731) with: {'C': 0.1, 'penalty': 'none', 'solver': 'sag'}
0.6250337381916329 (0.04658905893093731) with: {'C': 0.1, 'penalty': 'none', 'solver': 'saga'}
0.5860998650472335 (0.055123868959971484) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.5860998650472335 (0.055123868959971484) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.5886639676113361 (0.05075928590715263) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.5860998650472335 (0.055123868959971484) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'sag'}
0.5860998650472335 (0.055123868959971484) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'saga'}
0.0 (0.0) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'newton-cg'}
0.0 (0.0) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'lbfgs'}
0.5885964912280701 (0.04712929518708335) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
0.0 (0.0) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'sag'}
0.5912280701754387 (0.050097113569621966) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
0.6250337381916329 (0.04658905893093731) with: {'C': 0.01, 'penalty': 'none', 'solver': 'newton-cg'}
0.6250337381916329 (0.04658905893093731) with: {'C': 0.01, 'penalty': 'none', 'solver': 'lbfgs'}
0.0 (0.0) with: {'C': 0.01, 'penalty': 'none', 'solver': 'liblinear'}
0.6250337381916329 (0.04658905893093731) with: {'C': 0.01, 'penalty': 'none', 'solver': 'sag'}
0.6250337381916329 (0.04658905893093731) with: {'C': 0.01, 'penalty': 'none', 'solver': 'saga'}
0.42435897435897435 (0.040260789310156586) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}
0.42435897435897435 (0.040260789310156586) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
0.4841430499325236 (0.04559152076743) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
0.42435897435897435 (0.040260789310156586) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'sag'}
0.42435897435897435 (0.040260789310156586) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'saga'}
0.0 (0.0) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'newton-cg'}
0.0 (0.0) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'lbfgs'}
0.3620782726045884 (0.051478952908446944) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
0.0 (0.0) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'sag'}
0.3673414304993252 (0.03291187829790098) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py:425: FitFailedWarning:
200 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 73, in _check_solver
raise ValueError("penalty='none' is not supported for the liblinear solver")
ValueError: penalty='none' is not supported for the liblinear solver
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 56, in _check_solver
raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 56, in _check_solver
raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 56, in _check_solver
raise ValueError(
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.
You may see some warnings during the optimization for invalid configuration combinations. These can be safely ignored. The results are summarized as follows:
Best: 0.6251012145748989 using {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
# Create a better LogisticRegression model using the grid-search winner
# (solver='liblinear', penalty='l1', C=1.0).
# FIX: dropped n_jobs=-1 -- liblinear ignores it and only emitted a
# UserWarning ("'n_jobs' > 1 does not have any effect ... 'liblinear'").
logreg2 = LogisticRegression(solver='liblinear', penalty='l1', C=1.0)
pipe5_2 = Pipeline([('scaler', StandardScaler()), ('clf', logreg2)])
acc_test_5_2, acc_train_5_2, rec_test_5_2, rec_train_5_2 = modeling(
    clf=pipe5_2,
    x=x,
    y=y,
    test_size=0.2,
    classes={'Not Accepted':0, 'Accepted':1},  # label names for the confusion-matrix plot
    model_name='LogisticReg 2',
    stratify=True)
-------------------- Shape -------------------- x_train: (4000, 11) y_train: (4000, 1) x_test: (1000, 11) y_test: (1000, 1) --------------- Class Distribution in y_test --------------- > Class=0 : 904/1000 (90.4%) > Class=1 : 96/1000 (9.6%) --------------- Class Distribution in y_train --------------- > Class=0 : 3616/4000 (90.4%) > Class=1 : 384/4000 (9.6%)
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:1222: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.
-------------------- Confusion Matrix -------------------- [[892 12] [ 35 61]]
-------------------- Classification Report --------------------
precision recall f1-score support
0 0.96 0.99 0.97 904
1 0.84 0.64 0.72 96
accuracy 0.95 1000
macro avg 0.90 0.81 0.85 1000
weighted avg 0.95 0.95 0.95 1000
Jaccard Score: 0.5648148148148148
Log loss: 1.6940517092885061
# Train vs. test recall for the tuned LogisticRegression model.
print(f"Recall train: {rec_train_5_2}")
print(f"Recall test: {rec_test_5_2}")
Recall train: 0.6328125 Recall test: 0.6354166666666666
# Check whether changing test_size improves the tuned model.
change_test_size(pipe5_2, x, y, 'LogisticReg')
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:1222: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12. C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:1222: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12. C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:1222: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12. C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:1222: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12. C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:1222: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12. C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:1222: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 12.
| Model | test_size | ACC_train | ACC_test | Recall_train | Recall_test | |
|---|---|---|---|---|---|---|
| 0 | LogisticReg 1 | 0.2 | 0.952250 | 0.953000 | 0.632812 | 0.635417 |
| 2 | LogisticReg 3 | 0.3 | 0.951714 | 0.950000 | 0.636905 | 0.625000 |
| 5 | LogisticReg 6 | 0.45 | 0.956364 | 0.948444 | 0.659091 | 0.606481 |
| 1 | LogisticReg 2 | 0.25 | 0.954133 | 0.946400 | 0.650000 | 0.600000 |
| 4 | LogisticReg 5 | 0.4 | 0.954333 | 0.948500 | 0.649306 | 0.593750 |
| 3 | LogisticReg 4 | 0.35 | 0.953846 | 0.945714 | 0.657051 | 0.583333 |
So the best model from the LogisticRegression algorithm is the one with test_size = 0.2:
def knn_model(x, y, Ks, test_size, show_plot=1, stratify=True):
    """Fit KNN classifiers for every k in 1..Ks and report accuracy/recall.

    x (DataFrame or array): features
    y (DataFrame or array): binary target
    Ks (int): largest number of neighbors to try
    test_size (float): fraction of the data held out as the test split
    show_plot (1 or any): pass 1 to draw the accuracy/recall plots
    stratify (bool): preserve the class distribution when splitting

    Returns (ACC_test, ACC_train, REC_test, REC_train): four arrays of
    length Ks where index i holds the metric for k = i + 1.
    """
    # split dataset (optionally stratified on y)
    if stratify:
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=0, stratify=y)
    else:
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=0)
    print('-'*20, 'Shape', '-'*20)
    print ('Train set:', x_train.shape, y_train.shape)
    print ('Test set:', x_test.shape, y_test.shape, '\n')
    # metric holders: slot k-1 stores the result for n_neighbors=k
    # (removed a no-op `Ks = Ks` self-assignment from the original)
    ACC_train = np.zeros(Ks)
    ACC_test = np.zeros(Ks)
    REC_train = np.zeros(Ks)
    REC_test = np.zeros(Ks)
    # fit one scaled KNN pipeline per k and record train/test metrics
    for k in range(1, Ks+1):
        pipe = Pipeline([('scaler', StandardScaler()), ('clf', KNeighborsClassifier(n_neighbors=k))])
        pipe.fit(x_train, y_train.ravel())
        y_pred_train = pipe.predict(x_train)
        y_pred_test = pipe.predict(x_test)
        ACC_train[k-1] = accuracy_score(y_train, y_pred_train)
        ACC_test[k-1] = accuracy_score(y_test, y_pred_test)
        REC_train[k-1] = recall_score(y_train, y_pred_train)
        REC_test[k-1] = recall_score(y_test, y_pred_test)
    # draw plots
    if show_plot == 1:
        # k values for the x axis (renamed from `x`, which shadowed the
        # features argument; also removed the unused reversed copy)
        k_values = list(range(1, Ks+1))
        fig = make_subplots(rows=1, cols=2, subplot_titles=("Accuracy", "Recall"))
        # Train Accuracy plot (in 1st subplot)
        fig.add_trace(go.Scatter(
            x=k_values, y=ACC_train,
            line_color='rgb(0,100,80)',
            name='Train Accuracy',
        ), row=1, col=1)
        # Test Accuracy plot (in 1st subplot)
        fig.add_trace(go.Scatter(
            x=k_values, y=ACC_test,
            line_color='rgb(255,140,0)',
            name='Test Accuracy',
        ), row=1, col=1)
        # Train Recall plot (in 2nd subplot)
        fig.add_trace(go.Scatter(
            x=k_values, y=REC_train,
            line_color='rgb(212, 31, 13)',
            name='Train Recall',
        ), row=1, col=2)
        # Test Recall plot (in 2nd subplot)
        fig.add_trace(go.Scatter(
            x=k_values, y=REC_test,
            line_color='rgb(13, 109, 212)',
            name='Test Recall',
        ), row=1, col=2)
        # Update xaxis properties
        fig.update_xaxes(title_text="Number of Neighbors (k)", row=1, col=1)
        fig.update_xaxes(title_text="Number of Neighbors (k)", row=1, col=2)
        # Update yaxis properties
        fig.update_yaxes(title_text="Accuracy", row=1, col=1)
        fig.update_yaxes(title_text="Recall", row=1, col=2)
        fig.update_traces(mode='lines')
        fig.update_layout(title_text="Accuracy and Recall of KNN models for all k")
        fig.show()
    # print the best value of each metric and the k (argmax+1) that produced it
    print( f"The best train accuracy was {ACC_train.max()} with {ACC_train.argmax()+1}")
    print( f"The best test accuracy was {ACC_test.max()} with {ACC_test.argmax()+1}")
    print( f"The best train recall was {REC_train.max()} with {REC_train.argmax()+1}")
    print( f"The best test recall was {REC_test.max()} with {REC_test.argmax()+1}")
    return ACC_test, ACC_train, REC_test, REC_train
# Split train/test with the initial test_size=0.2.
# stratify=y preserves the target's class distribution in both splits.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)
# Cross-validate a scaled KNN pipeline to estimate accuracy before tuning.
operations = [('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]
pipe6_1 = Pipeline(operations)
Perform_cross_val(pipe6_1, k=10, x=x_train, y=y_train, scoring='accuracy')
-------------------- CV for k=10, scoring=accuracy -------------------- CV mean: 0.9574999999999999 CV results: [0.9425 0.965 0.9675 0.955 0.9575 0.96 0.965 0.95 0.955 0.9575]
# Cross-validate the same KNN pipeline, this time scoring recall.
Perform_cross_val(pipe6_1, k=10, x=x_train, y=y_train, scoring='recall')
-------------------- CV for k=10, scoring=recall -------------------- CV mean: 0.5988529014844804 CV results: [0.44736842 0.63157895 0.68421053 0.60526316 0.60526316 0.57894737 0.66666667 0.56410256 0.58974359 0.61538462]
According to the obtained result, we expect our KNN model to have an accuracy close to or greater than 0.957. Let's check it:
# Sweep k = 1..30 to find the best number of neighbors for the KNN model.
acc_test_6_1, acc_train_6_1, rec_test_6_1, rec_train_6_1 = knn_model(x, y, 30, 0.2, show_plot=1, stratify=True)
-------------------- Shape -------------------- Train set: (4000, 11) (4000, 1) Test set: (1000, 11) (1000, 1)
The best train accuracy was 1.0 with 1 The best test accuracy was 0.968 with 3 The best train recall was 1.0 with 1 The best test recall was 0.7395833333333334 with 1
# Check whether changing test_size improves the KNN model.
change_test_size(pipe6_1, x, y, 'KNN')
| Model | test_size | ACC_train | ACC_test | Recall_train | Recall_test | |
|---|---|---|---|---|---|---|
| 0 | KNN 1 | 0.2 | 0.971500 | 0.968000 | 0.723958 | 0.697917 |
| 2 | KNN 3 | 0.3 | 0.968857 | 0.962000 | 0.702381 | 0.645833 |
| 1 | KNN 2 | 0.25 | 0.972000 | 0.960800 | 0.730556 | 0.641667 |
| 4 | KNN 5 | 0.4 | 0.964333 | 0.960000 | 0.659722 | 0.625000 |
| 3 | KNN 4 | 0.35 | 0.965846 | 0.959429 | 0.673077 | 0.619048 |
| 5 | KNN 6 | 0.45 | 0.964364 | 0.957778 | 0.655303 | 0.615741 |
The best value of k for model accuracy is 3, but recall matters most to us, and for recall the best k is 1; however, k=1 is known to overfit the model. The best k for recall that performs well after k=1 without overfitting is k=3. (Because this is a binary classification problem, it is better to use an odd k, which avoids tie votes among the neighbors.)
# Build the chosen KNN model (k=3) and evaluate it with modeling().
# NOTE(review): this reuses the acc_test_6_1/... names, overwriting the
# per-k arrays returned by knn_model() above with this model's scalars.
knn = KNeighborsClassifier(n_neighbors=3)
pipe6_2 = Pipeline([('scaler', StandardScaler()), ('clf', knn)])
acc_test_6_1, acc_train_6_1, rec_test_6_1, rec_train_6_1 = modeling(
    clf=pipe6_2,
    x=x,
    y=y,
    test_size=0.2,
    classes={'Not Accepted':0, 'Accepted':1},  # label names for the confusion-matrix plot
    model_name='KNN1',
    stratify=True)
-------------------- Shape -------------------- x_train: (4000, 11) y_train: (4000, 1) x_test: (1000, 11) y_test: (1000, 1) --------------- Class Distribution in y_test --------------- > Class=0 : 904/1000 (90.4%) > Class=1 : 96/1000 (9.6%) --------------- Class Distribution in y_train --------------- > Class=0 : 3616/4000 (90.4%) > Class=1 : 384/4000 (9.6%)
-------------------- Confusion Matrix -------------------- [[899 5] [ 27 69]]
-------------------- Classification Report --------------------
precision recall f1-score support
0 0.97 0.99 0.98 904
1 0.93 0.72 0.81 96
accuracy 0.97 1000
macro avg 0.95 0.86 0.90 1000
weighted avg 0.97 0.97 0.97 1000
Jaccard Score: 0.6831683168316832
Log loss: 1.1533969084517488
# Train vs. test recall for the k=3 KNN model.
print(f"Recall train: {rec_train_6_1}")
print(f"Recall test: {rec_test_6_1}")
Recall train: 0.78125 Recall test: 0.71875
# KNN hyper-parameter tuning with grid search (scored on recall).
kValues = list(range(3, 31, 2))  # odd k only, avoids tie votes in binary KNN
weights = ['uniform','distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
p = [1,2]  # Minkowski power: 1 = Manhattan, 2 = Euclidean
# 'knn__' prefixes route each parameter to the 'knn' step of the pipeline
param_grid = dict(knn__n_neighbors=kValues, knn__weights=weights, knn__algorithm=algorithm, knn__p=p)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)
classifier = GridSearchCV(pipe6_1, param_grid, cv=10, scoring='recall')
classifier.fit(x_train, y_train.ravel())
GridSearchCV(cv=10,
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('knn', KNeighborsClassifier())]),
param_grid={'knn__algorithm': ['auto', 'ball_tree', 'kd_tree',
'brute'],
'knn__n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19,
21, 23, 25, 27, 29],
'knn__p': [1, 2],
'knn__weights': ['uniform', 'distance']},
scoring='recall')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=10,
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('knn', KNeighborsClassifier())]),
param_grid={'knn__algorithm': ['auto', 'ball_tree', 'kd_tree',
'brute'],
'knn__n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19,
21, 23, 25, 27, 29],
'knn__p': [1, 2],
'knn__weights': ['uniform', 'distance']},
scoring='recall')Pipeline(steps=[('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])StandardScaler()
KNeighborsClassifier()
# Show the best estimator's full parameter set as a table.
results1 = classifier.best_estimator_.get_params()
df_results1 = pd.DataFrame(results1).T.rename(columns={0: "Values1", 1: "Values2"})
df_results1
| Values1 | Values2 | |
|---|---|---|
| memory | None | None |
| steps | (scaler, StandardScaler()) | (knn, KNeighborsClassifier(n_neighbors=3, weig... |
| verbose | False | False |
| scaler | StandardScaler() | StandardScaler() |
| knn | KNeighborsClassifier(n_neighbors=3, weights='d... | KNeighborsClassifier(n_neighbors=3, weights='d... |
| scaler__copy | True | True |
| scaler__with_mean | True | True |
| scaler__with_std | True | True |
| knn__algorithm | auto | auto |
| knn__leaf_size | 30 | 30 |
| knn__metric | minkowski | minkowski |
| knn__metric_params | None | None |
| knn__n_jobs | None | None |
| knn__n_neighbors | 3 | 3 |
| knn__p | 2 | 2 |
| knn__weights | distance | distance |
# Full cross-validation results for every cell of the KNN grid.
df_results2 = pd.DataFrame(classifier.cv_results_)
df_results2
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_knn__algorithm | param_knn__n_neighbors | param_knn__p | param_knn__weights | params | split0_test_score | ... | split3_test_score | split4_test_score | split5_test_score | split6_test_score | split7_test_score | split8_test_score | split9_test_score | mean_test_score | std_test_score | rank_test_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.031911 | 0.002544 | 0.084984 | 0.007177 | auto | 3 | 1 | uniform | {'knn__algorithm': 'auto', 'knn__n_neighbors':... | 0.473684 | ... | 0.684211 | 0.605263 | 0.578947 | 0.641026 | 0.589744 | 0.641026 | 0.641026 | 0.611808 | 0.055369 | 17 |
| 1 | 0.028618 | 0.005496 | 0.049518 | 0.004723 | auto | 3 | 1 | distance | {'knn__algorithm': 'auto', 'knn__n_neighbors':... | 0.473684 | ... | 0.684211 | 0.605263 | 0.578947 | 0.641026 | 0.589744 | 0.641026 | 0.666667 | 0.614372 | 0.057226 | 13 |
| 2 | 0.035344 | 0.014913 | 0.085153 | 0.022657 | auto | 3 | 2 | uniform | {'knn__algorithm': 'auto', 'knn__n_neighbors':... | 0.578947 | ... | 0.684211 | 0.631579 | 0.578947 | 0.692308 | 0.615385 | 0.717949 | 0.641026 | 0.650877 | 0.047969 | 5 |
| 3 | 0.028072 | 0.007649 | 0.042945 | 0.011263 | auto | 3 | 2 | distance | {'knn__algorithm': 'auto', 'knn__n_neighbors':... | 0.578947 | ... | 0.684211 | 0.631579 | 0.578947 | 0.692308 | 0.615385 | 0.717949 | 0.666667 | 0.656073 | 0.051696 | 1 |
| 4 | 0.029854 | 0.008730 | 0.094246 | 0.013731 | auto | 5 | 1 | uniform | {'knn__algorithm': 'auto', 'knn__n_neighbors':... | 0.394737 | ... | 0.631579 | 0.552632 | 0.526316 | 0.615385 | 0.538462 | 0.615385 | 0.589744 | 0.575371 | 0.078589 | 33 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 219 | 0.016257 | 0.006381 | 0.155059 | 0.010013 | brute | 27 | 2 | distance | {'knn__algorithm': 'brute', 'knn__n_neighbors'... | 0.263158 | ... | 0.473684 | 0.447368 | 0.421053 | 0.487179 | 0.461538 | 0.487179 | 0.512821 | 0.439609 | 0.067187 | 161 |
| 220 | 0.018808 | 0.016605 | 0.215499 | 0.013972 | brute | 29 | 1 | uniform | {'knn__algorithm': 'brute', 'knn__n_neighbors'... | 0.236842 | ... | 0.394737 | 0.315789 | 0.368421 | 0.487179 | 0.461538 | 0.410256 | 0.435897 | 0.390013 | 0.068247 | 221 |
| 221 | 0.012483 | 0.006279 | 0.171575 | 0.014739 | brute | 29 | 1 | distance | {'knn__algorithm': 'brute', 'knn__n_neighbors'... | 0.263158 | ... | 0.421053 | 0.394737 | 0.421053 | 0.512821 | 0.461538 | 0.487179 | 0.435897 | 0.426586 | 0.063768 | 181 |
| 222 | 0.011126 | 0.006944 | 0.218426 | 0.028468 | brute | 29 | 2 | uniform | {'knn__algorithm': 'brute', 'knn__n_neighbors'... | 0.236842 | ... | 0.447368 | 0.394737 | 0.368421 | 0.487179 | 0.487179 | 0.384615 | 0.512821 | 0.408232 | 0.076421 | 205 |
| 223 | 0.017670 | 0.003351 | 0.161670 | 0.013735 | brute | 29 | 2 | distance | {'knn__algorithm': 'brute', 'knn__n_neighbors'... | 0.263158 | ... | 0.473684 | 0.447368 | 0.394737 | 0.487179 | 0.487179 | 0.487179 | 0.512821 | 0.436910 | 0.069860 | 169 |
224 rows × 22 columns
In the above tables, the best model is a model with n_neighbors = 3, P = 2, algorithm = auto and weights = distance that is a default model.
So best model by KNN algorithm is the model by test_size = 0.2 and k=3:
df
| Age | Experience | Income | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 4.08 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 45 | 19 | 2.83 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 39 | 15 | 0.92 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 35 | 9 | 8.33 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 35 | 8 | 3.75 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4995 | 29 | 3 | 3.33 | 1 | 1.9 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4996 | 30 | 4 | 1.25 | 4 | 0.4 | 1 | 85 | 0 | 0 | 0 | 1 | 0 |
| 4997 | 63 | 39 | 2.00 | 2 | 0.3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4998 | 65 | 40 | 4.08 | 3 | 0.5 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4999 | 28 | 4 | 6.92 | 3 | 0.8 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
5000 rows × 12 columns
# Estimate Decision Tree performance with 10-fold cross validation (accuracy)
pipe7_1 = Pipeline([('DTs', DecisionTreeClassifier(max_depth=3))])
Perform_cross_val(pipe7_1, k=10, x=x_train, y=y_train, scoring='accuracy')
-------------------- CV for k=10, scoring=accuracy -------------------- CV mean: 0.9814999999999999 CV results: [0.9725 0.9875 0.9775 0.9825 0.98 0.9775 0.985 0.9825 0.985 0.985 ]
# Cross-validate the same shallow-tree pipeline, scoring recall this time
Perform_cross_val(pipe7_1, x=x_train, y=y_train, k=10, scoring='recall')
-------------------- CV for k=10, scoring=recall -------------------- CV mean: 0.8383265856950066 CV results: [0.78947368 0.86842105 0.84210526 0.86842105 0.78947368 0.78947368 0.87179487 0.82051282 0.84615385 0.8974359 ]
Based on the results obtained above, we expect the model to have an accuracy of around 98% and its recall to be high. Let's check it:
# Fit an unpruned Decision Tree (no max_depth) inside a scaling pipeline and
# evaluate it with the project's modeling() helper on a stratified 80/20 split.
dts = DecisionTreeClassifier(random_state=0)
pipe7_1 = Pipeline([('scaler', StandardScaler()), ('clf', dts)])
acc_test_7_1, acc_train_7_1, rec_test_7_1, rec_train_7_1 = modeling(
    clf=pipe7_1, x=x, y=y, test_size=0.2,
    classes={'Not Accepted': 0, 'Accepted': 1},
    model_name='DT 1', stratify=True)
-------------------- Shape -------------------- x_train: (4000, 11) y_train: (4000, 1) x_test: (1000, 11) y_test: (1000, 1) --------------- Class Distribution in y_test --------------- > Class=0 : 904/1000 (90.4%) > Class=1 : 96/1000 (9.6%) --------------- Class Distribution in y_train --------------- > Class=0 : 3616/4000 (90.4%) > Class=1 : 384/4000 (9.6%)
-------------------- Confusion Matrix -------------------- [[900 4] [ 10 86]]
-------------------- Classification Report --------------------
precision recall f1-score support
0 0.99 1.00 0.99 904
1 0.96 0.90 0.92 96
accuracy 0.99 1000
macro avg 0.97 0.95 0.96 1000
weighted avg 0.99 0.99 0.99 1000
Jaccard Score: 0.86
Log loss: 0.5046111474476402
# Report the train/test metrics returned by modeling() for the unpruned tree
dt1_metrics = {
    'Train Accuracy': acc_train_7_1,
    'Test Accuracy': acc_test_7_1,
    'Train Recall': rec_train_7_1,
    'Test Recall': rec_test_7_1,
}
for label, value in dt1_metrics.items():
    print(f"{label}: {value}")
Train Accuracy: 1.0 Test Accuracy: 0.986 Train Recall: 1.0 Test Recall: 0.8958333333333334
# Refit the same unpruned tree directly (no scaler) to confirm the pipeline
# results; the identical scores show scaling does not change the tree's splits.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(x_train, y_train.ravel())
y_train_predicted = clf.predict(x_train)
y_test_predicted = clf.predict(x_test)
raw_tree_metrics = {
    'Train Accuracy': accuracy_score(y_train, y_train_predicted),
    'Test Accuracy': accuracy_score(y_test, y_test_predicted),
    'Train Recall': recall_score(y_train, y_train_predicted),
    'Test Recall': recall_score(y_test, y_test_predicted),
}
for label, value in raw_tree_metrics.items():
    print(f"{label}: {value}")
Train Accuracy: 1.0 Test Accuracy: 0.986 Train Recall: 1.0 Test Recall: 0.8958333333333334
# Hyper-parameter grid for pruning the tree (range(a, b) == range(a, b, 1))
# NOTE(review): dt1 is defined outside this view — presumably the base
# DecisionTreeClassifier(random_state=0); confirm before refactoring.
grid_param = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(2, 10),
    "min_samples_leaf": range(1, 15),
    "min_samples_split": range(2, 20),
}
cv = StratifiedKFold(n_splits=10)
grid_search = GridSearchCV(estimator=dt1, param_grid=grid_param, cv=cv, n_jobs=-1)
grid_search.fit(x_train, y_train.ravel())
GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
estimator=DecisionTreeClassifier(random_state=0), n_jobs=-1,
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': range(2, 10),
'min_samples_leaf': range(1, 15),
'min_samples_split': range(2, 20)})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
estimator=DecisionTreeClassifier(random_state=0), n_jobs=-1,
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': range(2, 10),
'min_samples_leaf': range(1, 15),
'min_samples_split': range(2, 20)})DecisionTreeClassifier(random_state=0)
DecisionTreeClassifier(random_state=0)
# Best hyper-parameter combination found by the 10-fold grid search
print(grid_search.best_params_)
{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 14, 'min_samples_split': 2}
# Check accuracy, recall and overfitting for the pruned tree
# NOTE(review): dt2 is defined outside this view — presumably a tree fitted
# with grid_search.best_params_; confirm against the earlier cells.
y_predicted_train = dt2.predict(x_train)
y_predicted_test = dt2.predict(x_test)
dt2_metrics = {
    'Train Accuracy': accuracy_score(y_train, y_predicted_train),
    'Test Accuracy': accuracy_score(y_test, y_predicted_test),
    'Train Recall': recall_score(y_train, y_predicted_train),
    'Test Recall': recall_score(y_test, y_predicted_test),
}
for label, value in dt2_metrics.items():
    print(f"{label}: {value}")
Train Accuracy: 0.958 Test Accuracy: 0.96 Train Recall: 0.734375 Test Recall: 0.7604166666666666
Our model is not overfit, and the obtained results show high accuracy and recall.
# Horizontal bar chart of dt2 feature importances, least important at the bottom
importances = dt2.feature_importances_
indices = np.argsort(importances)
positions = range(len(indices))
plt.figure(figsize=(8, 8))
plt.barh(positions, importances[indices], color='green', align='center')
plt.yticks(positions, [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.title('Feature Importances')
plt.show()
# Confusion matrix and per-class report for the pruned tree on the test split
cm = confusion_matrix(y_test, y_predicted_test)
plot_confusion_matrix2(cm=cm, classes={'Not Accepted':0, 'Accepted':1}, )
print(classification_report(y_test, y_predicted_test))
precision recall f1-score support
0 0.97 0.98 0.98 904
1 0.81 0.76 0.78 96
accuracy 0.96 1000
macro avg 0.89 0.87 0.88 1000
weighted avg 0.96 0.96 0.96 1000
According to the figure above, as seen in the EDA section, the two features Education and Income have the most effect and importance.
Important feature is Education, Income, Family, CCAvg, Securities Account, Online, Personal Loan
The obtained recall is good, but the model still incorrectly predicted 18 people who accepted the loan. Let's see whether this value can be reduced:
Now we use post-pruning:
# Post-pruning: compute the cost-complexity pruning path of the base tree.
# path exposes two parallel arrays: the effective alphas and the total
# leaf impurity of the pruned tree at each alpha.
path = dt1.cost_complexity_pruning_path(x_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
# fix: "wil" -> "will" in the printed message
print("ccp alpha will give list of values :", ccp_alphas)
print("\n")
print("Impurities in Decision Tree :", impurities)
ccp alpha wil give list of values : [0. 0.00016461 0.0002 0.0002 0.00024052 0.00024159 0.00025 0.00030303 0.00030909 0.00033333 0.00035714 0.00037255 0.00038571 0.00038889 0.0004 0.00040378 0.00040909 0.00041667 0.0004375 0.00045714 0.0004836 0.00049744 0.0005142 0.00052256 0.00056643 0.00057626 0.0006499 0.00071739 0.00082051 0.00286805 0.00296292 0.00312239 0.00654547 0.02407816 0.05475665] Impurities in Decision Tree : [0. 0.00049383 0.00089383 0.00129383 0.00273696 0.00322015 0.00347015 0.00437924 0.00623379 0.00656712 0.0072814 0.00839905 0.00878477 0.00956254 0.00996254 0.01036632 0.01118451 0.01160117 0.01203867 0.01249582 0.01394661 0.01444405 0.01598664 0.01703177 0.0175982 0.01932699 0.01997689 0.02069428 0.02151479 0.02438284 0.03030869 0.03343108 0.03997655 0.0640547 0.173568 ]
# Fit one decision tree per effective alpha; larger alphas prune harder,
# so the final tree collapses toward a single node.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(x_train, y_train.ravel())
    clfs.append(clf)
print(f"Number of nodes in the last tree is: {clfs[-1].tree_.node_count} "
      f"with ccp_alpha: {ccp_alphas[-1]}")
Number of nodes in the last tree is: 1 with ccp_alpha: 0.054756649204188663
# Step plots ('hv' line shape) of train and test recall versus ccp_alpha
train_scores = [recall_score(y_train, clf.predict(x_train)) for clf in clfs]
test_scores = [recall_score(y_test, clf.predict(x_test)) for clf in clfs]
fig = go.Figure()
for scores, trace_name in ((train_scores, 'Train Recall'),
                           (test_scores, 'Test Recall')):
    fig.add_trace(go.Scatter(x=ccp_alphas, y=scores, name=trace_name,
                             mode='lines+markers', line={"shape": 'hv'}))
fig.update_layout(xaxis_title='ccp_alphas',
                  yaxis_title='Recall',
                  title='Recall vs alpha for training and testing sets',
                  template='seaborn')
fig.show()
# Evaluate the post-pruned tree on both splits to check for overfitting
# NOTE(review): dt3 is defined outside this view — presumably the tree
# refit with the chosen ccp_alpha; confirm before refactoring.
y_predicted_train = dt3.predict(x_train)
y_predicted_test = dt3.predict(x_test)
dt3_metrics = {
    'Train Accuracy': accuracy_score(y_train, y_predicted_train),
    'Test Accuracy': accuracy_score(y_test, y_predicted_test),
    'Train Recall': recall_score(y_train, y_predicted_train),
    'Test Recall': recall_score(y_test, y_predicted_test),
}
for label, value in dt3_metrics.items():
    print(f"{label}: {value}")
Train Accuracy: 0.9875 Test Accuracy: 0.985 Train Recall: 0.9166666666666666 Test Recall: 0.9166666666666666
# Feature importances of the post-pruned tree, least important at the bottom
importances = dt3.feature_importances_
indices = np.argsort(importances)
positions = range(len(indices))
plt.figure(figsize=(8, 8))
plt.barh(positions, importances[indices], color='green', align='center')
plt.yticks(positions, [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.title('Feature Importances')
plt.show()
The drawn plot helps us to understand which features are less important and can be removed.
# Wrap the post-pruned tree in a scaling pipeline and evaluate it with the
# project's modeling() helper on the same stratified 80/20 split.
pipe7_2 = Pipeline([('scaler', StandardScaler()), ('clf', dt3)])
acc_test_7_2, acc_train_7_2, rec_test_7_2, rec_train_7_2 = modeling(
    clf=pipe7_2, x=x, y=y, test_size=0.2,
    classes={'Not Accepted': 0, 'Accepted': 1},
    model_name='DT 2', stratify=True)
-------------------- Shape -------------------- x_train: (4000, 11) y_train: (4000, 1) x_test: (1000, 11) y_test: (1000, 1) --------------- Class Distribution in y_test --------------- > Class=0 : 904/1000 (90.4%) > Class=1 : 96/1000 (9.6%) --------------- Class Distribution in y_train --------------- > Class=0 : 3616/4000 (90.4%) > Class=1 : 384/4000 (9.6%)
-------------------- Confusion Matrix -------------------- [[897 7] [ 8 88]]
-------------------- Classification Report --------------------
precision recall f1-score support
0 0.99 0.99 0.99 904
1 0.93 0.92 0.92 96
accuracy 0.98 1000
macro avg 0.96 0.95 0.96 1000
weighted avg 0.98 0.98 0.98 1000
Jaccard Score: 0.8543689320388349
Log loss: 0.5406548008367574
This decision tree model, obtained by post-pruning, has given us the best recall score of 91.7% on the test data.
We obtained very good result that is not overfit and the model has wrongly predicted only 8 people who accepted the loan
# Compare DT performance across several test_size splits using the project's
# change_test_size() helper (its ranked results table is shown below)
change_test_size(pipe7_2, x, y, 'DT')
| Model | test_size | ACC_train | ACC_test | Recall_train | Recall_test | |
|---|---|---|---|---|---|---|
| 0 | DT 1 | 0.2 | 0.987500 | 0.985000 | 0.916667 | 0.916667 |
| 2 | DT 3 | 0.3 | 0.987714 | 0.985333 | 0.925595 | 0.909722 |
| 5 | DT 6 | 0.45 | 0.987273 | 0.985333 | 0.935606 | 0.907407 |
| 1 | DT 2 | 0.25 | 0.988267 | 0.983200 | 0.927778 | 0.900000 |
| 4 | DT 5 | 0.4 | 0.983667 | 0.983500 | 0.861111 | 0.843750 |
| 3 | DT 4 | 0.35 | 0.981538 | 0.978857 | 0.817308 | 0.797619 |
So best model by Decision Tree algorithm is the model by test_size = 0.2
# Re-create the stratified 80/20 split (stratify keeps the class ratio in
# both splits), then cross-validate a default Random Forest for accuracy.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y)
operations = [('RF', RandomForestClassifier())]
pipe8_1 = Pipeline(operations)
Perform_cross_val(pipe8_1, x=x_train, y=y_train, k=10, scoring='accuracy')
-------------------- CV for k=10, scoring=accuracy -------------------- CV mean: 0.9860000000000001 CV results: [0.9825 0.99 0.995 0.985 0.99 0.97 0.9875 0.9825 0.9875 0.99 ]
# Cross-validate the same default Random Forest pipeline, scoring recall
Perform_cross_val(pipe8_1, k=10, x=x_train, y=y_train, scoring='recall')
-------------------- CV for k=10, scoring=recall -------------------- CV mean: 0.8800944669365723 CV results: [0.84210526 0.89473684 0.92105263 0.89473684 0.89473684 0.78947368 0.94871795 0.82051282 0.87179487 0.92307692]
# Fit a default (untuned) Random Forest inside a scaling pipeline and
# evaluate it with the project's modeling() helper.
rf = RandomForestClassifier()
pipe8_1 = Pipeline([('scaler', StandardScaler()), ('clf', rf)])
acc_test_8_1, acc_train_8_1, rec_test_8_1, rec_train_8_1 = modeling(
    clf=pipe8_1, x=x, y=y, test_size=0.2,
    classes={'Not Accepted': 0, 'Accepted': 1},
    model_name='RF 1', stratify=True)
-------------------- Shape -------------------- x_train: (4000, 11) y_train: (4000, 1) x_test: (1000, 11) y_test: (1000, 1) --------------- Class Distribution in y_test --------------- > Class=0 : 904/1000 (90.4%) > Class=1 : 96/1000 (9.6%) --------------- Class Distribution in y_train --------------- > Class=0 : 3616/4000 (90.4%) > Class=1 : 384/4000 (9.6%)
-------------------- Confusion Matrix -------------------- [[902 2] [ 9 87]]
-------------------- Classification Report --------------------
precision recall f1-score support
0 0.99 1.00 0.99 904
1 0.98 0.91 0.94 96
accuracy 0.99 1000
macro avg 0.98 0.95 0.97 1000
weighted avg 0.99 0.99 0.99 1000
Jaccard Score: 0.8877551020408163
Log loss: 0.3964801872802889
# Compare train vs test metrics for RF 1 (perfect train scores hint at overfit)
rf1_metrics = {
    'Train Accuracy': acc_train_8_1,
    'Test Accuracy': acc_test_8_1,
    'Train Recall': rec_train_8_1,
    'Test Recall': rec_test_8_1,
}
for label, value in rf1_metrics.items():
    print(f"{label}: {value}")
Train Accuracy: 1.0 Test Accuracy: 0.989 Train Recall: 1.0 Test Recall: 0.90625
It seems that the model tends to overfit here as well, so to solve this problem, we perform parameter tuning for RF:
# parameter tuning by loops instead grid search because
# gridsearch is very expensive and Time-consuming for this dataset
from numpy import mean
from numpy import std
from numpy import arange
from sklearn.model_selection import RepeatedStratifiedKFold
# get a list of models to evaluate
# explore random forest bootstrap sample size
def get_models1():
    """Build forests that bootstrap 10%..100% of the training rows (max_samples)."""
    print("Explore random forest bootstrap sample size")
    models = dict()
    for ratio in arange(0.1, 1.1, 0.1):
        key = f'{ratio:.1f}'
        # max_samples=None tells sklearn to use the full training set (100% case)
        models[key] = RandomForestClassifier(
            max_samples=None if ratio == 1.0 else ratio)
    return models
# get a list of models to evaluate
# explore random forest number of features effect
def get_models2():
    """Build forests with max_features swept from 1 to 7."""
    print("Explore random forest number of features effect")
    return {str(n_feat): RandomForestClassifier(max_features=n_feat)
            for n_feat in range(1, 8)}
# get a list of models to evaluate
# explore random forest tree depth effect
def get_models3():
    """Build forests with max_depth 1..9 plus None (fully grown trees)."""
    print("Explore random forest tree depth effect")
    candidate_depths = list(range(1, 10)) + [None]
    return {str(depth): RandomForestClassifier(max_depth=depth)
            for depth in candidate_depths}
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Return recall scores from stratified 10-fold CV repeated 3 times (30 fits)."""
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='recall', cv=folds, n_jobs=-1)
# Run each exploration: evaluate every candidate model with repeated CV,
# print its mean/std recall, then boxplot the score distributions side by side.
for func in [get_models1, get_models2, get_models3]:
    models = func()
    results, names = [], []
    for name, model in models.items():
        scores = evaluate_model(model, x_train, y_train)
        results.append(scores)
        names.append(name)
        print(f">{name:s}, mean:{mean(scores):.3f}, ste:{std(scores):.3f}")
    plt.boxplot(results, labels=names, showmeans=True)
    plt.show()
Explore random forest bootstrap sample size >0.1, mean:0.749, ste:0.062 >0.2, mean:0.829, ste:0.059 >0.3, mean:0.849, ste:0.056 >0.4, mean:0.859, ste:0.047 >0.5, mean:0.864, ste:0.051 >0.6, mean:0.865, ste:0.051 >0.7, mean:0.871, ste:0.048 >0.8, mean:0.879, ste:0.047 >0.9, mean:0.877, ste:0.044 >1.0, mean:0.885, ste:0.044
Explore random forest number of features effect >1, mean:0.773, ste:0.061 >2, mean:0.867, ste:0.049 >3, mean:0.885, ste:0.044 >4, mean:0.891, ste:0.043 >5, mean:0.895, ste:0.040 >6, mean:0.897, ste:0.039 >7, mean:0.900, ste:0.037
Explore random forest tree depth effect >1, mean:0.000, ste:0.000 >2, mean:0.107, ste:0.058 >3, mean:0.443, ste:0.094 >4, mean:0.652, ste:0.086 >5, mean:0.792, ste:0.068 >6, mean:0.839, ste:0.051 >7, mean:0.860, ste:0.050 >8, mean:0.874, ste:0.042 >9, mean:0.873, ste:0.050 >None, mean:0.876, ste:0.044
In this case, we can see that with max_samples=None, max_features=7, and max_depth=None we get the best result.
# Random Forest tuned from the sweeps above: entropy criterion, all 7
# features per split, full bootstrap samples, balanced subsample weights.
rf = RandomForestClassifier(
    criterion='entropy', n_estimators=100, max_samples=None,
    max_features=7, max_depth=None, class_weight='balanced_subsample')
pipe8_2 = Pipeline([('scaler', StandardScaler()), ('clf', rf)])
acc_test_8_2, acc_train_8_2, rec_test_8_2, rec_train_8_2 = modeling(
    clf=pipe8_2, x=x, y=y, test_size=0.2,
    classes={'Not Accepted': 0, 'Accepted': 1},
    model_name='RF 2', stratify=True)
-------------------- Shape -------------------- x_train: (4000, 11) y_train: (4000, 1) x_test: (1000, 11) y_test: (1000, 1) --------------- Class Distribution in y_test --------------- > Class=0 : 904/1000 (90.4%) > Class=1 : 96/1000 (9.6%) --------------- Class Distribution in y_train --------------- > Class=0 : 3616/4000 (90.4%) > Class=1 : 384/4000 (9.6%)
-------------------- Confusion Matrix -------------------- [[901 3] [ 9 87]]
-------------------- Classification Report --------------------
precision recall f1-score support
0 0.99 1.00 0.99 904
1 0.97 0.91 0.94 96
accuracy 0.99 1000
macro avg 0.98 0.95 0.96 1000
weighted avg 0.99 0.99 0.99 1000
Jaccard Score: 0.8787878787878788
Log loss: 0.432523840669406
# Train/test metrics for the tuned forest (RF 2)
rf2_metrics = {
    'Train Accuracy': acc_train_8_2,
    'Test Accuracy': acc_test_8_2,
    'Train Recall': rec_train_8_2,
    'Test Recall': rec_test_8_2,
}
for label, value in rf2_metrics.items():
    print(f"{label}: {value}")
Train Accuracy: 1.0 Test Accuracy: 0.988 Train Recall: 1.0 Test Recall: 0.90625
# Depth-limited Random Forest (max_depth=4, 150 trees) with out-of-bag
# scoring enabled, evaluated through the same modeling() helper.
# NOTE(review): RobustScaler is imported here but never used below —
# kept to preserve the namespace; consider removing.
from sklearn.preprocessing import RobustScaler
rf = RandomForestClassifier(
    criterion='entropy', n_estimators=150, max_samples=None,
    max_features=7, max_depth=4, class_weight='balanced_subsample',
    oob_score=True)
pipe8_3 = Pipeline([('scaler', StandardScaler()), ('clf', rf)])
acc_test_8_3, acc_train_8_3, rec_test_8_3, rec_train_8_3 = modeling(
    clf=pipe8_3, x=x, y=y, test_size=0.2,
    classes={'Not Accepted': 0, 'Accepted': 1},
    model_name='RF 3', stratify=True)
-------------------- Shape -------------------- x_train: (4000, 11) y_train: (4000, 1) x_test: (1000, 11) y_test: (1000, 1) --------------- Class Distribution in y_test --------------- > Class=0 : 904/1000 (90.4%) > Class=1 : 96/1000 (9.6%) --------------- Class Distribution in y_train --------------- > Class=0 : 3616/4000 (90.4%) > Class=1 : 384/4000 (9.6%)
-------------------- Confusion Matrix -------------------- [[873 31] [ 3 93]]
-------------------- Classification Report --------------------
precision recall f1-score support
0 1.00 0.97 0.98 904
1 0.75 0.97 0.85 96
accuracy 0.97 1000
macro avg 0.87 0.97 0.91 1000
weighted avg 0.97 0.97 0.97 1000
Jaccard Score: 0.7322834645669292
Log loss: 1.2254842152299834
# Report RF 3 metrics plus the OOB estimate, then refit on the training
# split and draw confusion matrices for the test and train sets.
print(f"Train Accuracy: {acc_train_8_3}")
print(f"Test Accuracy: {acc_test_8_3}")
print(f"Train Recall: {rec_train_8_3}")
print(f"Test Recall: {rec_test_8_3}")
print(f"OOB score: {rf.oob_score_}")
rf.fit(x_train, y_train.ravel())
y_train_predicted = rf.predict(x_train)
y_test_predicted = rf.predict(x_test)
for y_true, y_hat in ((y_test, y_test_predicted), (y_train, y_train_predicted)):
    cm = confusion_matrix(y_true, y_hat)
    plot_confusion_matrix2(cm=cm, classes={'Not Accepted': 0, 'Accepted': 1})
Train Accuracy: 0.977 Test Accuracy: 0.966 Train Recall: 0.9869791666666666 Test Recall: 0.96875 OOB score: 0.97275
# Track OOB error versus n_estimators for three max_features settings.
# fix: OrderedDict belongs to collections — typing.OrderedDict is only a
# deprecated generic alias of it.
from collections import OrderedDict

# One warm-start forest per max_features setting; warm_start lets the same
# forest grow tree-by-tree so we can record OOB error along the way.
# repr(mf) reproduces the original labels: 'sqrt', 'log2' quoted, 5 bare.
ensemble_clfs = [
    (
        f"RandomForestClassifier, max_features={mf!r}",
        RandomForestClassifier(
            warm_start=True,
            oob_score=True,
            criterion='entropy',
            n_estimators=150,
            max_samples=None,
            max_depth=4,
            class_weight='balanced_subsample',
            max_features=mf,
        ),
    )
    for mf in ('sqrt', 'log2', 5)
]
# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)
# Range of `n_estimators` values to explore.
min_estimators = 15
max_estimators = 150
for label, clf in ensemble_clfs:
    for i in range(min_estimators, max_estimators + 1, 5):
        clf.set_params(n_estimators=i)
        # fix: ravel() avoids sklearn's DataConversionWarning about a
        # column-vector y (see the warnings this cell previously emitted)
        clf.fit(x_train, y_train.ravel())
        # Record the OOB error for each `n_estimators=i` setting.
        error_rate[label].append((i, 1 - clf.oob_score_))
# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)
plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:578: UserWarning:
Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:578: UserWarning:
Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:578: UserWarning:
Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\base.py:1151: DataConversionWarning:
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
C:\Users\davoo\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:780: UserWarning:
class_weight presets "balanced" or "balanced_subsample" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use "balanced" weights, use compute_class_weight ("balanced", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.
The model does not appear to overfit, so there is no problem. This is a very nice result, because only 7 of the customers who accepted the bank loan were predicted incorrectly.
# Check whether the Random Forest pipeline (pipe8_3) improves when the
# train/test split ratio changes. `change_test_size` is a helper defined
# earlier in the notebook; presumably it refits the pipeline for several
# test_size values and returns a comparison table of train/test accuracy
# and recall, labeled 'RF' — confirm against its definition.
change_test_size(pipe8_3, x, y, 'RF')
| Model | test_size | ACC_train | ACC_test | Recall_train | Recall_test | |
|---|---|---|---|---|---|---|
| 0 | RF 1 | 0.2 | 0.974750 | 0.964000 | 0.986979 | 0.968750 |
| 2 | RF 3 | 0.3 | 0.978857 | 0.970000 | 0.982143 | 0.951389 |
| 1 | RF 2 | 0.25 | 0.979200 | 0.965600 | 0.986111 | 0.950000 |
| 3 | RF 4 | 0.35 | 0.979077 | 0.970857 | 0.980769 | 0.946429 |
| 4 | RF 5 | 0.4 | 0.977000 | 0.973500 | 0.986111 | 0.942708 |
| 5 | RF 6 | 0.45 | 0.977455 | 0.974222 | 0.981061 | 0.939815 |
So the best model built with the Random Forest algorithm is the one trained with test_size = 0.2.
We analyzed the Personal Loan campaign data using EDA and several different models. Finally, according to the table above, the model built with Random Forest is the best model for the intended purpose (higher recall), with Test_Recall = 0.97 and Test_Accuracy = 0.96.